# Import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Load the Netflix titles CSV into a DataFrame.
# NOTE(review): the absolute Windows path only resolves on the author's machine,
# and 'unicode_escape' decoding may garble non-ASCII text — confirm both.
df = pd.read_csv(
    "D:/CSVs/netflix_titles.csv",
    encoding="unicode_escape",
)
# (rows, columns) of the loaded frame
df.shape
(8807, 12)
# Basic overview of dataset: column names, non-null counts, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null int64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: int64(1), object(11) memory usage: 825.8+ KB
# Preview the first five rows of the dataset
df.head(n=5)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Rename columns to PascalCase (a matter of habit/comfort) so they are easier
# to reference while plotting.
# 'cast' and 'country' are left as-is: the previous mapping contained a
# misspelled 'csat' key (which never matched a column) and a no-op
# 'country' -> 'country' entry, and the rest of the analysis refers to both
# columns by their lowercase names.
df.rename(
    columns={
        'show_id': 'ShowId',
        'type': 'Type',
        'title': 'Title',
        'director': 'Director',
        'date_added': 'DateAdded',
        'release_year': 'ReleaseYear',
        'rating': 'Ratings',
        'duration': 'Duration',
        'listed_in': 'ListedIn',
        'description': 'Description',
    },
    inplace=True,
)
df.head(5)
| ShowId | Type | Title | Director | cast | country | DateAdded | ReleaseYear | Ratings | Duration | ListedIn | Description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Summary statistics of the dataset (count, mean, std, min, quartiles, max)
df.describe()
| ReleaseYear | |
|---|---|
| count | 8807.000000 |
| mean | 2014.180198 |
| std | 8.819312 |
| min | 1925.000000 |
| 25% | 2013.000000 |
| 50% | 2017.000000 |
| 75% | 2019.000000 |
| max | 2021.000000 |
# Display column names in dataset (after the rename)
df.columns
Index(['ShowId', 'Type', 'Title', 'Director', 'cast', 'country', 'DateAdded',
'ReleaseYear', 'Ratings', 'Duration', 'ListedIn', 'Description'],
dtype='object')
# Keep only the columns needed for the analysis.
# .copy() makes df1 an independent DataFrame, so the in-place fillna calls
# that follow no longer trigger SettingWithCopyWarning.
df1 = df[
    ['Type', 'Title', 'Director', 'cast', 'country',
     'ReleaseYear', 'Ratings', 'Duration', 'ListedIn']
].copy()
df1
| Type | Title | Director | cast | country | ReleaseYear | Ratings | Duration | ListedIn | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | 2020 | PG-13 | 90 min | Documentaries |
| 1 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries |
| 2 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... |
| 3 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | 2021 | TV-MA | 1 Season | Docuseries, Reality TV |
| 4 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8802 | Movie | Zodiac | David Fincher | Mark Ruffalo, Jake Gyllenhaal, Robert Downey J... | United States | 2007 | R | 158 min | Cult Movies, Dramas, Thrillers |
| 8803 | TV Show | Zombie Dumb | NaN | NaN | NaN | 2018 | TV-Y7 | 2 Seasons | Kids' TV, Korean TV Shows, TV Comedies |
| 8804 | Movie | Zombieland | Ruben Fleischer | Jesse Eisenberg, Woody Harrelson, Emma Stone, ... | United States | 2009 | R | 88 min | Comedies, Horror Movies |
| 8805 | Movie | Zoom | Peter Hewitt | Tim Allen, Courteney Cox, Chevy Chase, Kate Ma... | United States | 2006 | PG | 88 min | Children & Family Movies, Comedies |
| 8806 | Movie | Zubaan | Mozez Singh | Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan... | India | 2015 | TV-14 | 111 min | Dramas, International Movies, Music & Musicals |
8807 rows × 9 columns
# Boolean mask: True wherever a cell is missing
df1.isnull()
| Type | Title | Director | cast | country | ReleaseYear | Ratings | Duration | ListedIn | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | True | False | False | False | False | False |
| 1 | False | False | True | False | False | False | False | False | False |
| 2 | False | False | False | False | True | False | False | False | False |
| 3 | False | False | True | True | True | False | False | False | False |
| 4 | False | False | True | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8802 | False | False | False | False | False | False | False | False | False |
| 8803 | False | False | True | True | True | False | False | False | False |
| 8804 | False | False | False | False | False | False | False | False | False |
| 8805 | False | False | False | False | False | False | False | False | False |
| 8806 | False | False | False | False | False | False | False | False | False |
8807 rows × 9 columns
# Number of missing values per column
df1.isnull().sum()
Type 0 Title 0 Director 2634 cast 825 country 831 ReleaseYear 0 Ratings 4 Duration 3 ListedIn 0 dtype: int64
# Replace missing directors with a placeholder
df1.fillna(value={"Director": "Unavailable"}, inplace=True)
C:\Users\hp\AppData\Local\Temp\ipykernel_19192\1365858580.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.fillna({'Director':'Unavailable'},inplace=True)
# Fill null values with suitable placeholders.
# NOTE(review): Director was already filled with 'Unavailable' earlier, so this
# call finds no nulls left to replace.
df1.fillna(value={"Director": "Not available"}, inplace=True)
C:\Users\hp\AppData\Local\Temp\ipykernel_19192\4170769459.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.fillna({'Director':'Not available'},inplace=True)
# Replace missing cast lists with a placeholder
df1.fillna(value={"cast": "Others"}, inplace=True)
C:\Users\hp\AppData\Local\Temp\ipykernel_19192\3999689768.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.fillna({'cast':'Others'},inplace=True)
# Replace missing countries with a placeholder
df1.fillna(value={"country": "Others"}, inplace=True)
C:\Users\hp\AppData\Local\Temp\ipykernel_19192\3410488415.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.fillna({'country':'Others'},inplace=True)
# Replace missing ratings with a placeholder (spelling of the placeholder fixed).
# The former fill for 'DateAdded' was dropped: df1 was built without that
# column, so the call never had any effect.
df1.fillna({'Ratings': 'Not available'}, inplace=True)
C:\Users\hp\AppData\Local\Temp\ipykernel_19192\3716011444.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df1.fillna({'Ratings':'Not availabe'},inplace=True)
# Replace missing durations with a placeholder. The column is called
# 'Duration' after the rename; the lowercase 'duration' key used before
# matched no column and left the nulls in place.
df1.fillna({'Duration': 'Not available'}, inplace=True)
# dropna() is an alternative, but by default it removes every row that
# contains a missing value instead of filling it, which would shrink the data.
# Check whether any null values remain
pd.isnull(df1).sum()
Type 0 Title 0 Director 0 cast 0 country 0 ReleaseYear 0 Ratings 0 Duration 3 ListedIn 0 dtype: int64
# (rows, columns) of the cleaned subset
df1.shape
(8807, 9)
# Write the cleaned subset to df1.xlsx, leaving out the index column
df1.to_excel('df1.xlsx', index=False)
# Content type (Movie / TV Show) of every title
df1['Type']
0 Movie
1 TV Show
2 TV Show
3 TV Show
4 TV Show
...
8802 Movie
8803 TV Show
8804 Movie
8805 Movie
8806 Movie
Name: Type, Length: 8807, dtype: object
# Number of titles per content type
df1['Type'].value_counts()
Movie 6131 TV Show 2676 Name: Type, dtype: int64
# Countplot of types of shows.
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (10, 5)})
cp = sns.countplot(x='Type', data=df1)
# Adding count labels to the bars
for bars in cp.containers:
    cp.bar_label(bars)
# Adding title & display
plt.title('Type of shows')
plt.show()
# Ten most frequent ratings and how many titles carry each
df1['Ratings'].value_counts().head(10)
TV-MA 3207 TV-14 2160 TV-PG 863 R 799 PG-13 490 TV-Y7 334 TV-Y 307 PG 287 TV-G 220 NR 80 Name: Ratings, dtype: int64
# Pie chart of the ten most common ratings
pc = df1['Ratings'].value_counts().head(10)  # Series indexed by rating
# go.Pie takes the slice names as labels and the counts as values
circle = go.Figure(
    data=[go.Pie(labels=list(pc.index), values=list(pc.values))]
)
# Chart title
circle.update_layout(title_text="Overall ratings given by rating systems ")
circle.show()
# Countplot of ratings given by rating systems (type-wise).
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (10, 5)})
cp1 = sns.countplot(x='Ratings', data=df1, hue='Type')
# Adding count labels to the bars
for bars in cp1.containers:
    cp1.bar_label(bars)
plt.title('type-wise ratings by rating systems')
plt.show()
# Six most frequent values of the country column
df1['country'].value_counts().head(6)
United States 2818 India 972 Others 831 United Kingdom 419 Japan 245 South Korea 199 Name: country, dtype: int64
# Countplot of the number of titles for the six most frequent countries.
# NOTE(review): the bar order is ranked on df['country'], not df1['country'],
# so the 'Others' placeholder filled into df1 is presumably excluded —
# confirm this is intended.
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (20, 5)})
cp2 = sns.countplot(
    x='country',
    order=df['country'].value_counts().index[0:6],
    data=df1,
)
# Adding count labels to the bars
for bars in cp2.containers:
    cp2.bar_label(bars)
plt.title('Country-wise shows on Netflix')
plt.show()
# Country-wise countplot for the six most frequent countries, split by Type.
# hue is used to split the bars by a categorical column.
# NOTE(review): the bar order is ranked on df['country'], not df1['country'],
# so the 'Others' placeholder filled into df1 is presumably excluded —
# confirm this is intended.
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (20, 5)})
cp2 = sns.countplot(
    x='country',
    order=df['country'].value_counts().index[0:6],
    data=df1,
    hue='Type',
)
# Adding count labels to the bars
for bars in cp2.containers:
    cp2.bar_label(bars)
plt.title('Country-wise shows on Netflix')
plt.show()
# Countplot of types of show released every year.
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (15, 5)})
cp3 = sns.countplot(data=df1, x='ReleaseYear', hue='Type')
# Adding count labels to the bars
for bars in cp3.containers:
    cp3.bar_label(bars)
plt.title('Types of shows released every year')
# plt.show must be called; the bare name only evaluates to the function object
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Donut chart of the ten most frequent country values (shares in percent)
pc1 = df1['country'].value_counts()[:10]
# go.Figure holds the visualization; go.Pie builds the pie from labels and
# values, and hole=0.3 cuts out the centre
fig1 = go.Figure(
    data=[
        go.Pie(
            labels=list(pc1.index),
            values=list(pc1.values),
            hole=0.3,
        )
    ]
)
# update_layout sets the chart title
fig1.update_layout(title_text="Top 10 country-wise production ")
fig1.show()
# Per-type total of the ReleaseYear column, largest total first.
# NOTE(review): this adds up calendar years, which is not a meaningful
# measure — a title count per type may have been intended; confirm.
watched = (
    df1.groupby(['Type'], as_index=False)['ReleaseYear']
    .sum()
    .sort_values(by='ReleaseYear', ascending=False)
)
watched
| Type | ReleaseYear | |
|---|---|---|
| 0 | Movie | 12342448 |
| 1 | TV Show | 5396437 |
# Barplot of the per-type ReleaseYear totals.
# The figure size is configured BEFORE plotting: it is read when the figure is
# created, so setting it afterwards only affected later figures.
sns.set(rc={'figure.figsize': (5, 5)})
watched1 = sns.barplot(x='Type', y='ReleaseYear', data=watched)
# Adding value labels to the bars
for bars in watched1.containers:
    watched1.bar_label(bars)
# Large values may be shown in scientific (exponential) notation:
# e.g. 1.0435e+07 means 1.0435 multiplied by 10 to the power of 7.
# Display the figure, as the other plot cells do, so the next plot starts fresh
plt.show()
# Number of titles for every (Type, ReleaseYear) pair; after reset_index the
# count is stored in the 'Title' column
grouped = df.groupby(['Type', 'ReleaseYear'])['Title'].size().reset_index()
grouped
| Type | ReleaseYear | Title | |
|---|---|---|---|
| 0 | Movie | 1942 | 2 |
| 1 | Movie | 1943 | 3 |
| 2 | Movie | 1944 | 3 |
| 3 | Movie | 1945 | 3 |
| 4 | Movie | 1946 | 1 |
| ... | ... | ... | ... |
| 114 | TV Show | 2017 | 265 |
| 115 | TV Show | 2018 | 380 |
| 116 | TV Show | 2019 | 397 |
| 117 | TV Show | 2020 | 436 |
| 118 | TV Show | 2021 | 315 |
119 rows × 3 columns
# Lineplot to compare types of shows year-wise.
# sns.set only configures the figure size and returns nothing, so its result
# is no longer bound to a name.
sns.set(rc={'figure.figsize': (15, 5)})
sns.lineplot(data=grouped, x='ReleaseYear', y='Title', hue='Type')
plt.title('Year-wise total content')
plt.show()
1) The count of titles has increased significantly over the years. 2) Specifically, two types of titles are available on Netflix: 'Movies' & 'TV Shows'. 3) The top rating is 'TV-MA', which applies to over 36.7% of shows (3207). 4) The United States is the country with the most releases on Netflix, with a record of 2818 total shows, followed by India, United Kingdom, Japan & South Korea. 5) The United States has the largest production count with 46.6%. 6) The comparison chart shows significant growth in TV shows during 2019-2020; the COVID pandemic is one of the biggest reasons for this change.